# Import necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import os
import sys
import time
import math
import re
import string
from bs4 import BeautifulSoup
import pickle
import joblib
from joblib import dump, load
import nltk as nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import spacy
from spacy import displacy
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from pprint import pprint
import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
from bertopic import BERTopic
import torch
import tensorflow as tf
import ipywidgets
from pandarallel import pandarallel
import multiprocessing
from multiprocessing import Pool
import warnings
warnings.filterwarnings(action = 'ignore', category = UserWarning, module = 'gensim')
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)
%matplotlib inline
2023-05-26 23:53:26.858603: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2023-05-26 23:53:29.504505: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64 2023-05-26 23:53:29.504631: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64 2023-05-26 23:53:29.504644: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. 2023-05-26 23:53:31.644557: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-05-26 23:53:31.647956: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-05-26 23:53:31.651216: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-05-26 23:53:31.654392: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-05-26 23:53:31.658602: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-05-26 23:53:31.661352: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-05-26 23:53:31.664076: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-05-26 23:53:31.666743: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-05-26 23:53:31.669509: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-05-26 23:53:31.672257: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-05-26 23:53:31.674936: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2023-05-26 23:53:31.677610: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
num_processors = multiprocessing.cpu_count()
workers = num_processors-1
print(f'Available CPUs: {num_processors}')
Available CPUs: 96
pandarallel.initialize(progress_bar=True, nb_workers=num_processors-1, use_memory_fs=False)
INFO: Pandarallel will run on 95 workers. INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')
[nltk_data] Downloading package stopwords to [nltk_data] /home/jupyter/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to /home/jupyter/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package vader_lexicon to [nltk_data] /home/jupyter/nltk_data... [nltk_data] Package vader_lexicon is already up-to-date!
True
%%time
df_yelp_sentiment = pd.read_parquet('df_yelp_sentiment.parquet', engine = 'pyarrow')
df_yelp_sentiment.shape
CPU times: user 4.79 s, sys: 3.04 s, total: 7.82 s Wall time: 7.72 s
(200284, 5)
df_yelp_sentiment.head()
| date | clean_title | clean_text | sentiment | sentiment_category | |
|---|---|---|---|---|---|
| 0 | 2021-03-18 | Artificial intelligence improves parking efficiency in Chinese cities - People's Daily Online | Artificial intelligence improves parking efficiency in Chinese cities - People's Daily Online Ho... | 1 | Positive |
| 1 | 2020-02-27 | Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Rob... | Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Rob... | 0 | Negative |
| 2 | 2021-03-26 | Forget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & R... | Forget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & R... | 0 | Negative |
| 3 | 2021-03-10 | Strategy Analytics: 71% of Smartphones Sold Globally in 2021 will be AI Powered – Consumer Elect... | Strategy Analytics: 71% of Smartphones Sold Globally in 2021 will be AI Powered – Consumer Elect... | 0 | Negative |
| 4 | 2020-10-20 | Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagn... | Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagn... | 0 | Negative |
df_topic_BERT = df_yelp_sentiment.copy()
df_topic_BERT.head()
| date | clean_title | clean_text | sentiment | sentiment_category | |
|---|---|---|---|---|---|
| 0 | 2021-03-18 | Artificial intelligence improves parking efficiency in Chinese cities - People's Daily Online | Artificial intelligence improves parking efficiency in Chinese cities - People's Daily Online Ho... | 1 | Positive |
| 1 | 2020-02-27 | Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Rob... | Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Rob... | 0 | Negative |
| 2 | 2021-03-26 | Forget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & R... | Forget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & R... | 0 | Negative |
| 3 | 2021-03-10 | Strategy Analytics: 71% of Smartphones Sold Globally in 2021 will be AI Powered – Consumer Elect... | Strategy Analytics: 71% of Smartphones Sold Globally in 2021 will be AI Powered – Consumer Elect... | 0 | Negative |
| 4 | 2020-10-20 | Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagn... | Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagn... | 0 | Negative |
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
# keep only letters, periods, and white space
df_topic_BERT['clean_text'] = df_topic_BERT['clean_text'].parallel_apply(lambda x: re.sub(r'[^a-zA-Z.\s]', '',x))
# change to lower case
df_topic_BERT['clean_text'] = df_topic_BERT['clean_text'].parallel_apply(lambda x: x.lower())
# remove stop words
df_topic_BERT['clean_text'] = df_topic_BERT['clean_text'].parallel_apply(lambda x: ' '.join([word for word in x.split()if word not in (stopwords)]))
VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2109), Label(value='0 / 2109'))), …
VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2109), Label(value='0 / 2109'))), …
VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2109), Label(value='0 / 2109'))), …
skip_list = ['Data Science', 'data science', 'DATA SCIENCE', 'AI', 'ai', 'artificial intelligence', 'Artificial Intelligence', 'ARTIFICIAL INTELLIGENCE', 'ML', 'NLP',
'Artificial General Intelligence', 'Chatbots', 'AI Marketplaces', 'Intelligent Applications', 'Augmented Intelligence', 'Decision Intelligence',
'AI Cloud Services', 'cloud services', 'GPU Accelerators', 'Computer Vision', 'Deep Neural Network', 'Deep Learning', 'Cognitive Computing',
'Autonomous Vehicles', 'Knowledge Graphs', 'Responsible AI', 'Machine Customers', 'Decision Intelligence', 'Autonomous Vehicles', 'Human-Centered AI',
'AI Governance', 'Natural Language Processing', 'Machine Learning', 'Smart Robots', 'Operational AI Systems', 'Data-Centric AI', 'AI TRiSM',
'Generative AI', 'Responsible AI']
stopwords = set(STOPWORDS)
stopwords.update(skip_list)
df_topic_BERT['clean_text'] = df_topic_BERT['clean_text'].parallel_apply(lambda x: ' '.join([word for word in x.split()if word not in (stopwords)]))
VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2109), Label(value='0 / 2109'))), …
df_topic_BERT.head()
| date | clean_title | clean_text | sentiment | sentiment_category | |
|---|---|---|---|---|---|
| 0 | 2021-03-18 | Artificial intelligence improves parking efficiency in Chinese cities - People's Daily Online | artificial intelligence improves parking efficiency chinese cities peoples daily online home chi... | 1 | Positive |
| 1 | 2020-02-27 | Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Rob... | children autism saw learning social skills boosted playing robot news parliament skip content th... | 0 | Negative |
| 2 | 2021-03-26 | Forget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & R... | forget ml industry . obsolescence focus february test rework solutions dataweek home us back iss... | 0 | Negative |
| 3 | 2021-03-10 | Strategy Analytics: 71% of Smartphones Sold Globally in 2021 will be AI Powered – Consumer Elect... | strategy analytics smartphones sold globally powered consumer electronics net skip content consu... | 0 | Negative |
| 4 | 2020-10-20 | Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagn... | olympus support endoscopic diagnosis education doctors india launch diagnostic support applicati... | 0 | Negative |
X = df_topic_BERT['clean_text'].values
%%time
model = BERTopic(verbose=True, nr_topics= 8, n_gram_range=(1, 3))
topics, probabilities = model.fit_transform(X)
# 1hr 42mins
Downloading (…)e9125/.gitattributes: 0%| | 0.00/1.18k [00:00<?, ?B/s]
Downloading (…)_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]
Downloading (…)7e55de9125/README.md: 0%| | 0.00/10.6k [00:00<?, ?B/s]
Downloading (…)55de9125/config.json: 0%| | 0.00/612 [00:00<?, ?B/s]
Downloading (…)ce_transformers.json: 0%| | 0.00/116 [00:00<?, ?B/s]
Downloading (…)125/data_config.json: 0%| | 0.00/39.3k [00:00<?, ?B/s]
Downloading pytorch_model.bin: 0%| | 0.00/90.9M [00:00<?, ?B/s]
Downloading (…)nce_bert_config.json: 0%| | 0.00/53.0 [00:00<?, ?B/s]
Downloading (…)cial_tokens_map.json: 0%| | 0.00/112 [00:00<?, ?B/s]
Downloading (…)e9125/tokenizer.json: 0%| | 0.00/466k [00:00<?, ?B/s]
Downloading (…)okenizer_config.json: 0%| | 0.00/350 [00:00<?, ?B/s]
Downloading (…)9125/train_script.py: 0%| | 0.00/13.2k [00:00<?, ?B/s]
Downloading (…)7e55de9125/vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]
Downloading (…)5de9125/modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]
Batches: 0%| | 0/6259 [00:00<?, ?it/s]
2023-05-27 00:09:51,390 - BERTopic - Transformed documents to Embeddings 2023-05-27 00:13:37,859 - BERTopic - Reduced dimensionality
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2023-05-27 00:14:02,305 - BERTopic - Clustered reduced embeddings 2023-05-27 00:58:34,837 - BERTopic - Reduced number of topics from 3939 to 8
CPU times: user 4h 58min 44s, sys: 1h 27s, total: 5h 59min 12s Wall time: 1h 3s
# Save model as a file using pickle
with open('bert_model.pkl', 'wb') as f:
pickle.dump(model, f)
# Load model from the saved file
with open('bert_model.pkl', 'rb') as f:
model = pickle.load(f)
model.visualize_barchart()
model.visualize_topics()
model.get_topic_info()
| Topic | Count | Name | |
|---|---|---|---|
| 0 | -1 | 49914 | -1_market_news_us_data |
| 1 | 0 | 150062 | 0_market_news_us_data |
| 2 | 1 | 163 | 1_javascript_javascript disabled current_current browser configurationis_seeking alpha javascript |
| 3 | 2 | 47 | 2_us_exports_export_analyzing satellite |
| 4 | 3 | 40 | 3_enter valid_valid_enter_axios |
| 5 | 4 | 30 | 4_shielding_shielding materials_emi_market |
| 6 | 5 | 17 | 5_market_post covid update_covid update global_update global |
| 7 | 6 | 11 | 6_center backup software_center backup_backup software_data center backup |
topics_keywords = model.get_topics()
keywords_df_list = []
for i in range(len(topics_keywords)-1):
topic_df = pd.DataFrame(topics_keywords[i])
keywords_df_list.append(topic_df)
keywords_list = pd.concat(keywords_df_list)[0]
keywords_list_br = [keywords_list[i:i + 10] for i in range(0, len(keywords_list), 10)]
keywords_df = pd.DataFrame(keywords_list_br, index=['Topic_0', 'Topic_1', 'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5', 'Topic_6'])
# keywords_df.to_csv("keywords_BERT.csv", index=False)
keywords_df2 = pd.read_csv("keywords_BERT.csv")
keywords_df2
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | market | news | us | data | new | intelligence | artificial | artificial intelligence | technology | business |
| 1 | javascript | javascript disabled current | current browser configurationis | seeking alpha javascript | browser configurationis compatible | configurationis compatible | configurationis compatible site | browser configurationis | configurationis | compatible site |
| 2 | us | exports | export | analyzing satellite | satellite images | satellite | new | rules | analyzing satellite images | news |
| 3 | enter valid | valid | enter | axios | enter valid emailsubscription | emailsubscription | valid emailsubscription | addressplease enter valid | enter valid emailplease | valid emailplease |
| 4 | shielding | shielding materials | emi | market | shielding materials technologies | rfi shielding materials | rfi shielding | materials technologies | emi rfi shielding | emi rfi |
| 5 | market | post covid update | covid update global | update global | covid update | post covid | global | covid | update | post |
| 6 | center backup software | center backup | backup software | data center backup | backup | data center | backup software market | center | market | software |
df_topic_BERT['Topic'] = topics
df_topic_BERT.to_csv("topics_BERT.csv", index = False)
df_topic_BERT = df_topic_BERT[df_topic_BERT["Topic"]!=-1]
df_topic_BERT.head()
| date | clean_title | clean_text | sentiment | sentiment_category | Topic | |
|---|---|---|---|---|---|---|
| 6 | 2020-12-08 | From the Bard to broadcaster: Stratford Festival builds new identity with streamer | National En... | bard broadcaster stratford festival builds new identity streamer national entertainment penticto... | 1 | Positive | 0 |
| 9 | 2020-06-14 | Artificial Intelligence In Behavioral And Mental Health Care Market to Witness Astonishing Growt... | artificial intelligence behavioral mental health care market witness astonishing growth focusing... | 0 | Negative | 0 |
| 10 | 2020-07-10 | AI/ Machine Learning Market 2020 Expected to Reach $XX Million by 2024 – IBM, BAIDU, SOUNDHOUND,... | machine learning market expected reach xx million ibm baidu soundhound zebra medical vision pris... | 1 | Positive | 0 |
| 11 | 2020-03-16 | According to Latest Report on Machine Learning Courses Market to Grow with an Impressive CAGR: T... | according latest report machine learning courses market grow impressive cagr top key players edx... | 1 | Positive | 0 |
| 14 | 2023-04-06 | Video Trump deepfakes on social media prompt warnings of AI risks - ABC News | video trump deepfakes social media prompt warnings risks abc news abc newsvideoliveshowsguns ame... | 1 | Positive | 0 |
# Group data by sentiment_category and Topic columns and get size of each group
grouped = df_topic_BERT.groupby(['sentiment_category', 'Topic']).size().reset_index(name='count')
# Pivot the table to have sentiment_category as columns, Topic as index, and count as values
pivot_table = grouped.pivot(index='Topic', columns='sentiment_category', values='count')
# Plot the pivot table as a stacked bar chart
pivot_table.plot(kind='bar', stacked=True)
plt.show()
df_topic_BERT['Topic_category'] = df_topic_BERT['Topic'].map({0: 'Technology insights & news', 1: 'Browser_script', 2:'Technology product & service', 3:'Shielding Materials',
4:'Covid_related', 5:'classesntry', 6:'Text-related', 7:'map florida results'})
df_topic_BERT["Topic"].value_counts()
Topic 0 150062 1 163 2 47 3 40 4 30 5 17 6 11 Name: count, dtype: int64
Articles that have a negative sentiment
df_neg_BERT = df_topic_BERT[df_topic_BERT['sentiment_category']=='Negative']
df_neg_BERT = df_neg_BERT[['clean_text']].drop_duplicates()
X_neg = df_neg_BERT['clean_text'].values
X_neg.shape
(99313,)
%%time
model_neg_topics = BERTopic(verbose=True, nr_topics= 10, n_gram_range=(1, 3))
neg_topics, neg_probabilities = model_neg_topics.fit_transform(X_neg)
Batches: 0%| | 0/3104 [00:00<?, ?it/s]
2023-05-27 01:20:47,190 - BERTopic - Transformed documents to Embeddings 2023-05-27 01:22:12,397 - BERTopic - Reduced dimensionality
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2023-05-27 01:22:23,924 - BERTopic - Clustered reduced embeddings 2023-05-27 01:45:35,303 - BERTopic - Reduced number of topics from 2594 to 10
CPU times: user 2h 28min 3s, sys: 18min 44s, total: 2h 46min 47s Wall time: 30min 27s
model_neg_topics.visualize_barchart()
import matplotlib.pyplot as plt
import numpy as np
# Create a colormap
colors = plt.cm.get_cmap('hsv', len(topics['Topic'][:6]))
fig, ax = plt.subplots(figsize=(15, 10)) # Change figure size as per your requirement
# Width of the bars
width = 0.1
# Iterate over each topic
for i, topic in enumerate(topics['Topic'][:6]):
# Get topic words and their probabilities
topic_words = model_neg_topics.get_topic(topic)
words = [word[0] for word in topic_words]
probs = [word[1] for word in topic_words]
# Generate bar positions
bar_positions = np.arange(len(words)) + i * width
# Create bar chart for this topic
ax.bar(bar_positions, probs, width, label=f'Topic {topic}', color=colors(i))
# Add labels, title, and legend
ax.set_xlabel('Words')
ax.set_ylabel('Probabilities')
ax.set_title('Topic Modeling')
ax.set_xticks(bar_positions - width * (len(topics['Topic'][:6]) - 1) / 2)
ax.set_xticklabels(words)
plt.xticks(rotation=90)
ax.legend()
plt.show()
model_neg_topics.visualize_topics()
topics_keywords = model_neg_topics.get_topics()
keywords_df_list = []
for i in range(len(topics_keywords)-1):
topic_df = pd.DataFrame(topics_keywords[i])
keywords_df_list.append(topic_df)
keywords_list = pd.concat(keywords_df_list)[0]
keywords_list_br = [keywords_list[i:i + 10] for i in range(0, len(keywords_list), 10)]
keywords_df = pd.DataFrame(keywords_list_br, index=['Topic_0', 'Topic_1', 'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5', 'Topic_6', 'Topic_7', 'Topic_8'])
keywords_df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| Topic_0 | market | news | us | data | new | intelligence | technology | artificial | business | artificial intelligence |
| Topic_1 | market | report | analysis | global | catheter | growth | research | medical | vascular | urinary |
| Topic_2 | data | learning | machine | science | data science | deep learning | developer | deep | course | machine learning |
| Topic_3 | american | middle eastern | eastern | native | native american | african american | asian | hispanic | eastern native american | middle eastern native |
| Topic_4 | maritime | maritime risk | maritime risk analysis | risk analysis | capability | gsts | risk | enhance maritime | enhance maritime risk | solutions |
| Topic_5 | maine | maine public | public | maine public television | public television | maine public classical | public classical | classical | podcast | watch |
| Topic_6 | likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuubavatarhttpsleonardoosnov... | likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuubavatarhttpsleonardoosnov... | likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuubavatarhttpsleonardoosnov... | likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuubavatarhttpsleonardoosnov... | likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuubavatarhttpsleonardoosnov... | likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuubavatarhttpsleonardoosnov... | null | chatgpt | likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuuavatarhttpsleonardoosnova... | likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuuavatarhttpsleonardoosnova... |
| Topic_7 | post covid update | market | covid update global | update global | covid update | post covid | covid | global | update | post |
| Topic_8 | assistive | elderly assistive | disabled elderly assistive | disabled elderly | elderly | disabled | market | assistive technology | elderly assistive technology | assistive technology market |
Articles that have a positive sentiment
df_pos_BERT = df_topic_BERT[df_topic_BERT['sentiment_category']=='Positive']
df_pos_BERT = df_pos_BERT[['clean_text']].drop_duplicates()
X_pos = df_pos_BERT['clean_text'].values
X_pos.shape
(49462,)
%%time
model_pos_topics = BERTopic(verbose=True, nr_topics= 10, n_gram_range=(1, 3))
pos_topics, pos_probabilities = model_pos_topics.fit_transform(X_pos)
Batches: 0%| | 0/1546 [00:00<?, ?it/s]
2023-05-27 02:07:15,861 - BERTopic - Transformed documents to Embeddings 2023-05-27 02:07:56,752 - BERTopic - Reduced dimensionality
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2023-05-27 02:08:03,546 - BERTopic - Clustered reduced embeddings 2023-05-27 02:20:28,615 - BERTopic - Reduced number of topics from 1289 to 10
CPU times: user 1h 8min 58s, sys: 7min 1s, total: 1h 15min 59s Wall time: 16min 11s
model_pos_topics.visualize_barchart()
model_pos_topics.visualize_topics()
topics_keywords = model_pos_topics.get_topics()
keywords_df_list = []
for i in range(len(topics_keywords)-1):
topic_df = pd.DataFrame(topics_keywords[i])
keywords_df_list.append(topic_df)
keywords_list = pd.concat(keywords_df_list)[0]
keywords_list_br = [keywords_list[i:i + 10] for i in range(0, len(keywords_list), 10)]
keywords_df = pd.DataFrame(keywords_list_br, index=['Topic_0', 'Topic_1', 'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5', 'Topic_6', 'Topic_7', 'Topic_8'])
keywords_df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| Topic_0 | market | intelligence | news | artificial | us | artificial intelligence | data | global | report | new |
| Topic_1 | market | report | global | analysis | medical | research | growth | catheters | key | vascular |
| Topic_2 | gpu | nvidia | performance | rtx | modules | mxm gpu | gpu modules | mxm | dihuni | mxm gpu modules |
| Topic_3 | camera | camera market | market | fever | detector thermal | detector thermal camera | thermal camera | pixels | fever detector thermal | fever detector |
| Topic_4 | gartner | enter valid | testing validation | valid | cool | independent testing validation | independent testing | calypsoai | validation | testing |
| Topic_5 | market | hemostat | disposable hemostat | disposable | global | report | brachytherapy | analysis | growth | seeds market |
| Topic_6 | javascript | disabledplease | disabledplease enable | javascript currently disabledplease | currently disabledplease | currently disabledplease enable | jumi | disabledplease enable better | experience jumi | better experience jumi |
| Topic_7 | thrive global | thrive | global | use | people | intelligence | businesses | work | facebookfollow thrive | facebookfollow thrive global |
| Topic_8 | shielding | shielding materials | emi | shielding materials technologies | rfi shielding materials | rfi shielding | materials technologies | emi rfi | emi rfi shielding | rfi |
# group the DataFrame by sentiment and topic category, count the occurrences of clean_text, and reset the index
temp = df_topic_BERT.groupby(['sentiment_category', 'Topic_category'])['clean_text'].count().reset_index()
# rename the count column to reflect the sentiment category and set the index to topic category
temp = temp.rename(columns={'clean_text': 'count'}).set_index('Topic_category')
# filter the DataFrame by sentiment category and rename the count column
temp_neg = temp[temp['sentiment_category'] == 'Negative'].rename(columns={'count': 'count_of_negative_articles'})
temp_pos = temp[temp['sentiment_category'] == 'Positive'].rename(columns={'count': 'count_of_positive_articles'})
# concatenate the two DataFrames and select only the relevant columns
topics_by_sentiment = pd.concat([temp_neg, temp_pos], axis=1)[['count_of_negative_articles', 'count_of_positive_articles']]
# sort the DataFrame by the count of positive articles in descending order
topics_by_sentiment = topics_by_sentiment.sort_values(by='count_of_positive_articles', ascending=False)
# Dataframe showing number of positive and negative articles respectively per topic
topics_by_sentiment
| count_of_negative_articles | count_of_positive_articles | |
|---|---|---|
| Topic_category | ||
| Technology insights & news | 100253 | 49809 |
| Browser_script | 121 | 42 |
| Covid_related | 13 | 17 |
| Technology product & service | 30 | 17 |
| Shielding Materials | 26 | 14 |
| Text-related | 5 | 6 |
| classesntry | 14 | 3 |
Analyzing certain companies
1) ChatGPT
# define the keywords to search for
keywords = ['chatGPT','chatgpt']
# join the keywords using the OR operator '|'
query = '|'.join(keywords)
# filter the DataFrame by the query string
tp_chatGPT = df_topic_BERT[df_topic_BERT['clean_title'].str.contains(query)]
# count the occurrences of sentiment category and transpose the DataFrame
cnt_chatGPT = pd.DataFrame(tp_chatGPT['sentiment_category'].value_counts()).rename(columns={'sentiment_category': 'chatGPT'}).T
# calculate the total number of articles and add it as a new column
cnt_chatGPT['total'] = cnt_chatGPT['Negative'] + cnt_chatGPT['Positive']
# calculate the percentage of negative and positive articles and round to two decimal places
cnt_chatGPT['Negative%'] = np.round(100 * cnt_chatGPT['Negative'] / cnt_chatGPT['total'], 2)
cnt_chatGPT['Positive%'] = np.round(100 * cnt_chatGPT['Positive'] / cnt_chatGPT['total'], 2)
cnt_chatGPT
| sentiment_category | Negative | Positive | total | Negative% | Positive% |
|---|---|---|---|---|---|
| count | 27 | 23 | 50 | 54.0 | 46.0 |
2) Google
# define the keywords to search for
keywords = ['google', 'Google']
# join the keywords using the OR operator '|'
query = '|'.join(keywords)
# filter the DataFrame by the query string
tp_google = df_topic_BERT[df_topic_BERT['clean_title'].str.contains(query)]
# count the occurrences of sentiment category and transpose the DataFrame
cnt_google = pd.DataFrame(tp_google['sentiment_category'].value_counts()).rename(columns={'sentiment_category': 'Google'}).T
# calculate the total number of articles and add it as a new column
cnt_google['total'] = cnt_google['Negative'] + cnt_google['Positive']
# calculate the percentage of negative and positive articles and round to two decimal places
cnt_google['Negative%'] = np.round(100 * cnt_google['Negative'] / cnt_google['total'], 2)
cnt_google['Positive%'] = np.round(100 * cnt_google['Positive'] / cnt_google['total'], 2)
cnt_google
| sentiment_category | Negative | Positive | total | Negative% | Positive% |
|---|---|---|---|---|---|
| count | 5981 | 2508 | 8489 | 70.46 | 29.54 |
3) Microsoft
# define the keywords to search for
keywords = ['microsoft', 'Microsoft', 'MS', 'ms']
# join the keywords using the OR operator '|'
query = '|'.join(keywords)
# filter the DataFrame by the query string
tp_microsoft = df_topic_BERT[df_topic_BERT['clean_title'].str.contains(query)]
# count the occurrences of sentiment category and transpose the DataFrame
cnt_microsoft = pd.DataFrame(tp_microsoft['sentiment_category'].value_counts()).rename(columns={'sentiment_category': 'Microsoft'}).T
# calculate the total number of articles and add it as a new column
cnt_microsoft['total'] = cnt_microsoft['Negative'] + cnt_microsoft['Positive']
# calculate the percentage of negative and positive articles and round to two decimal places
cnt_microsoft['Negative%'] = np.round(100 * cnt_microsoft['Negative'] / cnt_microsoft['total'], 2)
cnt_microsoft['Positive%'] = np.round(100 * cnt_microsoft['Positive'] / cnt_microsoft['total'], 2)
cnt_microsoft
| sentiment_category | Negative | Positive | total | Negative% | Positive% |
|---|---|---|---|---|---|
| count | 10401 | 5555 | 15956 | 65.19 | 34.81 |